In [ ]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Ignore Warnings
import warnings
warnings.filterwarnings("ignore")
In [2]:
# Read the cybersecurity attacks CSV into a DataFrame
df = pd.read_csv("C:\\Users\\vaish\\Downloads\\archive (1)\\cybersecurity_attacks.csv")

# First look at the data: leading rows (transposed), column labels, shape and dtypes
df.head().T
df.columns
print(f"There are {df.shape[0]}, row and {df.shape[1]} columns in the dataset")
df.info()

# Number of missing values per column, largest count first
df.isna().sum().sort_values(ascending=False)

# Share of missing values per column, expressed in percent
df.isna().sum().div(len(df)).mul(100)
There are 40000, row and 25 columns in the dataset
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Timestamp               40000 non-null  object 
 1   Source IP Address       40000 non-null  object 
 2   Destination IP Address  40000 non-null  object 
 3   Source Port             40000 non-null  int64  
 4   Destination Port        40000 non-null  int64  
 5   Protocol                40000 non-null  object 
 6   Packet Length           40000 non-null  int64  
 7   Packet Type             40000 non-null  object 
 8   Traffic Type            40000 non-null  object 
 9   Payload Data            40000 non-null  object 
 10  Malware Indicators      20000 non-null  object 
 11  Anomaly Scores          40000 non-null  float64
 12  Alerts/Warnings         19933 non-null  object 
 13  Attack Type             40000 non-null  object 
 14  Attack Signature        40000 non-null  object 
 15  Action Taken            40000 non-null  object 
 16  Severity Level          40000 non-null  object 
 17  User Information        40000 non-null  object 
 18  Device Information      40000 non-null  object 
 19  Network Segment         40000 non-null  object 
 20  Geo-location Data       40000 non-null  object 
 21  Proxy Information       20149 non-null  object 
 22  Firewall Logs           20039 non-null  object 
 23  IDS/IPS Alerts          19950 non-null  object 
 24  Log Source              40000 non-null  object 
dtypes: float64(1), int64(3), object(21)
memory usage: 7.6+ MB
Out[2]:
Timestamp                  0.0000
Source IP Address          0.0000
Destination IP Address     0.0000
Source Port                0.0000
Destination Port           0.0000
Protocol                   0.0000
Packet Length              0.0000
Packet Type                0.0000
Traffic Type               0.0000
Payload Data               0.0000
Malware Indicators        50.0000
Anomaly Scores             0.0000
Alerts/Warnings           50.1675
Attack Type                0.0000
Attack Signature           0.0000
Action Taken               0.0000
Severity Level             0.0000
User Information           0.0000
Device Information         0.0000
Network Segment            0.0000
Geo-location Data          0.0000
Proxy Information         49.6275
Firewall Logs             49.9025
IDS/IPS Alerts            50.1250
Log Source                 0.0000
dtype: float64
In [3]:
# Handle the missing values.

# 'Alerts/Warnings' becomes a yes/no flag: 'yes' when an alert was triggered,
# 'no' otherwise (missing values count as 'no'). An existing 'yes' is accepted
# too, so re-running this cell keeps the flags instead of resetting every row
# to 'no'.
df['Alerts/Warnings'] = (
    df['Alerts/Warnings']
    .isin(['Alert Triggered', 'yes'])
    .map({True: 'yes', False: 'no'})
)

# Placeholder written into each column wherever the value is missing:
#   Malware Indicators -> no malware indicator was recorded
#   Proxy Information  -> it is assumed that no proxy was used
#   Firewall Logs      -> it is assumed that there is no data
#   IDS/IPS Alerts     -> no alert was generated by IDS/IPS
missing_value_labels = {
    'Malware Indicators': 'No Detection',
    'Proxy Information': 'No proxy',
    'Firewall Logs': 'No Data',
    'IDS/IPS Alerts': 'No Data',
}
for column, label in missing_value_labels.items():
    # Existing (non-missing) values are kept unchanged.
    df[column] = df[column].fillna(label)

# Confirm that no missing values remain
df.isnull().sum().sort_values(ascending=False)
Out[3]:
Timestamp                 0
Attack Type               0
IDS/IPS Alerts            0
Firewall Logs             0
Proxy Information         0
Geo-location Data         0
Network Segment           0
Device Information        0
User Information          0
Severity Level            0
Action Taken              0
Attack Signature          0
Alerts/Warnings           0
Source IP Address         0
Anomaly Scores            0
Malware Indicators        0
Payload Data              0
Traffic Type              0
Packet Type               0
Packet Length             0
Protocol                  0
Destination Port          0
Source Port               0
Destination IP Address    0
Log Source                0
dtype: int64
In [4]:
# Explore the Device Information column.
# 'Browser' is the part of each Device Information string that precedes the first '/'.
df['Browser'] = df['Device Information'].str.split('/', n=1).str.get(0)

df['Browser']
import re

# OS / device names to look for in a user-agent string, in priority order:
# the first entry that matches wins. 'Android' is listed before 'Linux'
# because Android user agents commonly also contain the word "Linux"
# (e.g. "(Linux; Android 10; ...)") and would otherwise be labelled Linux.
patterns = [
    r'Windows',
    r'Android',
    r'Linux',
    r'iPad',
    r'iPod',
    r'iPhone',
    r'Macintosh',
]


def extract_device_or_os(user_agent):
    """Return the first OS/device name from ``patterns`` found in a user agent.

    Parameters
    ----------
    user_agent : str
        The user-agent text to inspect. Any non-string value (``None``, NaN, ...)
        is treated as having no recognisable device.

    Returns
    -------
    str
        The matching entry of ``patterns`` exactly as it is written there, so a
        case-insensitive hit such as "windows" still yields 'Windows'; or
        'Unknown' when nothing matches.
    """
    if not isinstance(user_agent, str):
        # Missing values reach this function as None/NaN; re.search would raise on them.
        return 'Unknown'

    for pattern in patterns:
        # re.I makes the search case-insensitive
        if re.search(pattern, user_agent, re.I):
            # Every pattern is a plain word, so it doubles as the canonical label.
            return pattern
    return 'Unknown'  # Return 'Unknown' if no patterns match

# Label every row with the device/OS reported by extract_device_or_os
df['Device/OS'] = df['Device Information'].map(extract_device_or_os)

df['Browser'].value_counts()
# Device Information has been split into Browser and Device/OS, so drop it
df = df.drop(columns='Device Information')
In [5]:
def extract_time_features(df, Timestamp):
    """Add calendar/clock columns derived from a timestamp column.

    The column named by ``Timestamp`` is converted with ``pd.to_datetime`` and
    the columns 'Year', 'Month', 'Day', 'Hour', 'Minute', 'Second' and
    'DayOfWeek' are written from it. The frame passed in is modified in place
    and that same object is returned.
    """
    # Parse the timestamp column (a no-op if it already holds datetimes)
    df[Timestamp] = pd.to_datetime(df[Timestamp])

    # New column name -> datetime accessor attribute it is filled from
    feature_sources = (
        ('Year', 'year'),
        ('Month', 'month'),
        ('Day', 'day'),
        ('Hour', 'hour'),
        ('Minute', 'minute'),
        ('Second', 'second'),
        ('DayOfWeek', 'dayofweek'),
    )
    for feature_name, accessor_attr in feature_sources:
        df[feature_name] = getattr(df[Timestamp].dt, accessor_attr)

    return df
# Add the time-feature columns to df.
# extract_time_features writes the columns into the frame it is given and
# returns that same object, so new_df is another name for df, not a copy.
# One call is enough; the call and its print used to be repeated verbatim.
new_df = extract_time_features(df, 'Timestamp')

# Check if new columns are created
print(new_df.head())

# Only the last expression of a cell is rendered as its output (df.columns here).
df.head().T
df.describe(include = 'object').T
df.columns
            Timestamp Source IP Address Destination IP Address  Source Port  \
0 2023-05-30 06:33:58     103.216.15.12           84.9.164.252        31225   
1 2020-08-26 07:08:30    78.199.217.198         66.191.137.154        17245   
2 2022-11-13 08:23:25      63.79.210.48          198.219.82.17        16811   
3 2023-07-02 10:38:46     163.42.196.10        101.228.192.255        20018   
4 2023-07-16 13:11:07     71.166.185.76        189.243.174.238         6131   

   Destination Port Protocol  Packet Length Packet Type Traffic Type  \
0             17616     ICMP            503        Data         HTTP   
1             48166     ICMP           1174        Data         HTTP   
2             53600      UDP            306     Control         HTTP   
3             32534      UDP            385        Data         HTTP   
4             26646      TCP           1462        Data          DNS   

                                        Payload Data  ... Log Source  Browser  \
0  Qui natus odio asperiores nam. Optio nobis ius...  ...     Server  Mozilla   
1  Aperiam quos modi officiis veritatis rem. Omni...  ...   Firewall  Mozilla   
2  Perferendis sapiente vitae soluta. Hic delectu...  ...   Firewall  Mozilla   
3  Totam maxime beatae expedita explicabo porro l...  ...   Firewall  Mozilla   
4  Odit nesciunt dolorem nisi iste iusto. Animi v...  ...   Firewall  Mozilla   

   Device/OS  Year Month Day Hour Minute Second DayOfWeek  
0    Windows  2023     5  30    6     33     58         1  
1    Windows  2020     8  26    7      8     30         2  
2    Windows  2022    11  13    8     23     25         6  
3  Macintosh  2023     7   2   10     38     46         6  
4    Windows  2023     7  16   13     11      7         6  

[5 rows x 33 columns]
            Timestamp Source IP Address Destination IP Address  Source Port  \
0 2023-05-30 06:33:58     103.216.15.12           84.9.164.252        31225   
1 2020-08-26 07:08:30    78.199.217.198         66.191.137.154        17245   
2 2022-11-13 08:23:25      63.79.210.48          198.219.82.17        16811   
3 2023-07-02 10:38:46     163.42.196.10        101.228.192.255        20018   
4 2023-07-16 13:11:07     71.166.185.76        189.243.174.238         6131   

   Destination Port Protocol  Packet Length Packet Type Traffic Type  \
0             17616     ICMP            503        Data         HTTP   
1             48166     ICMP           1174        Data         HTTP   
2             53600      UDP            306     Control         HTTP   
3             32534      UDP            385        Data         HTTP   
4             26646      TCP           1462        Data          DNS   

                                        Payload Data  ... Log Source  Browser  \
0  Qui natus odio asperiores nam. Optio nobis ius...  ...     Server  Mozilla   
1  Aperiam quos modi officiis veritatis rem. Omni...  ...   Firewall  Mozilla   
2  Perferendis sapiente vitae soluta. Hic delectu...  ...   Firewall  Mozilla   
3  Totam maxime beatae expedita explicabo porro l...  ...   Firewall  Mozilla   
4  Odit nesciunt dolorem nisi iste iusto. Animi v...  ...   Firewall  Mozilla   

   Device/OS  Year Month Day Hour Minute Second DayOfWeek  
0    Windows  2023     5  30    6     33     58         1  
1    Windows  2020     8  26    7      8     30         2  
2    Windows  2022    11  13    8     23     25         6  
3  Macintosh  2023     7   2   10     38     46         6  
4    Windows  2023     7  16   13     11      7         6  

[5 rows x 33 columns]
Out[5]:
Index(['Timestamp', 'Source IP Address', 'Destination IP Address',
       'Source Port', 'Destination Port', 'Protocol', 'Packet Length',
       'Packet Type', 'Traffic Type', 'Payload Data', 'Malware Indicators',
       'Anomaly Scores', 'Alerts/Warnings', 'Attack Type', 'Attack Signature',
       'Action Taken', 'Severity Level', 'User Information', 'Network Segment',
       'Geo-location Data', 'Proxy Information', 'Firewall Logs',
       'IDS/IPS Alerts', 'Log Source', 'Browser', 'Device/OS', 'Year', 'Month',
       'Day', 'Hour', 'Minute', 'Second', 'DayOfWeek'],
      dtype='object')
In [6]:
# Exploratory plots built with Plotly Express.
# Each figure is bound to `fig`; binding it to `plt` would overwrite the
# matplotlib.pyplot alias imported at the top of the notebook.

# Day of month, split by malware indicator
fig = px.histogram(df, x = 'Day', color = 'Malware Indicators', title = 'Number of Malware Attacks by Day')
fig.show()
# Month distribution
fig = px.histogram(df, x = 'Month', title = 'Month')
fig.show()
# Month, split by malware indicator
fig = px.histogram(df, x = 'Month', color = 'Malware Indicators', title = 'Number of Malware Attacks by Month')
fig.show()
# Year distribution
fig = px.histogram(df, x='Year', title = 'Year')
fig.show()
# Year, split by malware indicator
fig = px.histogram(df, x = 'Year', color = 'Malware Indicators', title = 'Number of Malware Attacks by Year')
fig.show()
# Protocol, split by malware indicator
fig = px.histogram(df, x = 'Protocol', color = 'Malware Indicators', title = 'Number of Malware Attacks by Protocol')
fig.show()
# Traffic type distribution
fig = px.pie(df, names = 'Traffic Type', title = 'Traffic Distribution')
fig.show()
# Traffic type, split by malware indicator
fig = px.histogram(df, x = 'Traffic Type', color = 'Malware Indicators', title = 'Number of Malware Attacks by Traffic Type')
fig.show()
# Attack type distribution
fig = px.pie(df, names = 'Attack Type', title = 'Attack Type Distribution')
fig.show()
# Attack type, split by traffic type
fig = px.histogram(df, x='Attack Type', color='Traffic Type', title='Number of Malware Attacks by Attack Type')
fig.show()
# Browser distribution
fig = px.pie(df, names = 'Browser', title = 'Browser Distribution')
fig.show()
# Platform (Device/OS) distribution
fig = px.pie(df, names = 'Device/OS', title = 'Platform Distribution')
fig.show()
# Platform (Device/OS), split by browser

fig = px.histogram(df, x ='Device/OS', color= 'Browser', title = 'Platform Distribution')
fig.show()
# Platform (Device/OS), split by attack type
fig = px.histogram(df, x= 'Device/OS', color = 'Attack Type', title = 'Number of Malware Attacks by Browser and Devices')
fig.show()
# Browser, split by attack type
fig = px.histogram(df, x= 'Browser', color='Attack Type', title= 'Number of Attacks by Browser')
fig.show()
# Log source distribution
fig = px.histogram(df, x='Log Source', title='Log Source')
fig.show()
# Action taken distribution
fig = px.histogram(df, x='Action Taken', title='Action Taken')
fig.show()
# Action taken, split by attack type (the title used to read 'Log Source')
fig = px.histogram(df, x='Action Taken', color='Attack Type', title='Action Taken by Attack Type')
fig.show()
# Log source, split by attack type
fig = px.histogram(df, x='Log Source', color='Attack Type', title='Log Source')
fig.show()
In [7]:
# Compare the Packet Length distributions of Malware, Intrusion and DDoS attacks
import plotly.graph_objs as go

# Packet lengths of the rows belonging to each attack type
malware_data = df.loc[df['Attack Type'] == 'Malware', 'Packet Length']
intrusion_data = df.loc[df['Attack Type'] == 'Intrusion', 'Packet Length']
ddos_data = df.loc[df['Attack Type'] == 'DDoS', 'Packet Length']

# One semi-transparent histogram trace per attack type
malware_histogram = go.Histogram(x=malware_data, name='Malware', opacity=0.7)
intrusion_histogram = go.Histogram(x=intrusion_data, name='Intrusion', opacity=0.7)
ddos_histogram = go.Histogram(x=ddos_data, name='DDoS', opacity=0.7)

# Figure title and axis labels
layout = go.Layout(
    title='Packet Length Distribution for Different Attack Types',
    xaxis={'title': 'Packet Length'},
    yaxis={'title': 'Frequency'},
)

# Assemble the three traces into a single figure and render it
fig = go.Figure(
    data=[malware_histogram, intrusion_histogram, ddos_histogram],
    layout=layout,
)
fig.show()
In [ ]: